import pandas as pd
import urllib
import numpy as np
import urllib.request
import re
from textblob import TextBlob
%run lib.py
#name="Legally%20Blonde"
#name="aboutmary"
#name="10Things"
name="magnolia"
#name="Friday%20The%2013th"
#name="Ghost%20Ship"
#name="Juno"
#name="Reservoir+Dogs"
#name="shawshank"
#name="Sixth%20Sense,%20The"
#name="sunset_bld_3_21_49"
#name="Titanic"
#name="toy_story"
#name="trainspotting"
#name="transformers"
#name="the-truman-show_shooting"
#name="batman_production"
ext="html"
txtfiles=["Ghost%20Ship", "Legally%20Blonde", "Friday%20The%2013th", "Juno", "Reservoir+Dogs", "Sixth%20Sense,%20The", "Titanic"]
if name in txtfiles:
ext="txt"
# Download the screenplay and reduce it to a list of plain-text lines.
# The context manager guarantees the HTTP response is closed even if the
# read or decode raises (the original called fp.close() manually).
url = "http://www.dailyscript.com/scripts/" + name + "." + ext
with urllib.request.urlopen(url) as fp:
    mystr = fp.read().decode("utf8", "ignore")  # "ignore": some scripts have stray bytes
liston = mystr.split("\n")
liston = [s.replace('\r', '') for s in liston]               # normalise CRLF endings
liston = [re.sub('<[^<]+?>', '', text) for text in liston]   # strip HTML tags
# The Shawshank script uses tabs for layout; normalise to spaces so the
# leading-whitespace prefix matching below works.
if name=="shawshank":
    liston=[i.replace("\t", " ") for i in liston]

char=""      # character currently speaking ("" means nobody is speaking)
script=[]    # one dict per speech: {'char', 'dialogue', 'scene', 'adverb'}

# Leading-whitespace prefixes that mark each screenplay element; filled in
# by nextbigchunk below. (The duplicate initialisation to ' ' in the
# original was dead code — immediately overwritten — and has been removed.)
charintro=''
endofdialogue=''
dialoguepre=''
newscenepre=''

i=45  # skip the title page: start scanning the script at line 45
# Detect, in order, the indentation prefix used for character names,
# adverbs (parentheticals), dialogue bodies, and scene headings.
# nextbigchunk (lib.py) advances i past each detected chunk.
print("Characters")
i, charintro=nextbigchunk(liston, i)
print("Adverbs")
i, adverb=nextbigchunk(liston, i, adverbs=True)
print("Dialogues")
i, dialoguepre=nextbigchunk(liston, i)
print("New Scene:")
i, newscenepre=nextbigchunk(liston, i)
# "X" means detection failed — retry further into the script.
if newscenepre=="X":
    i=100
    i, newscenepre=nextbigchunk(liston, i)
# Hard-coded override for the About Mary script: 55-space scene prefix.
if name=="aboutmary":
    newscenepre=" " * 55
# A scene prefix identical in width to the character prefix is useless;
# disable scene detection by making it unmatchable.
if len(newscenepre)==len(charintro):
    newscenepre="X"
endofdialogue=newscenepre
scene=1
scene=1
for s in liston:
if s[0:len(charintro)]==charintro and s[len(charintro)]!=" " and s.strip()[0]!="(" and s.strip()[len(s.strip())-1]!=")":
#print("Charatcer*****")
char=s[len(charintro):]
new=dict()
new['char']=char.strip()
new['dialogue']=""
new['scene']=scene
new['adverb']=""
if s==endofdialogue or s.replace(" ", "")=="":
if char!="":
char=""
script.append(new)
if char!="" and s[0:len(dialoguepre)]==dialoguepre and s[len(dialoguepre)]!=" ":
#print("Dialogue******")
if new['dialogue']!="":
new['dialogue']=new['dialogue']+" "
new['dialogue']=new['dialogue']+s[len(dialoguepre):]
if char!="" and ((s[0:len(adverb)]==adverb and s[len(adverb)]!=" ") or (len(s)>1 and s.strip()[0]=="(" and s.strip()[len(s.strip())-1]==")" )):
if new['adverb']!="":
new['adverb']=new['adverb']+" "
new['adverb']=new['adverb']+s[len(adverb):]
if s[0:len(newscenepre)]==newscenepre and len(s)>len(newscenepre) and ( s.isupper()) and s[len(newscenepre)]!=" ":
scene=scene+1
# Persist the parsed script, then reload it so the analysis below works
# from the CSV exactly as a fresh session would.
frame = pd.DataFrame(script)
frame.to_csv(name+'.csv', index=None)
frame
magnolia=pd.read_csv(name+'.csv')
stopwords = getstopwords()
# Strip decorations from character names so variants collapse together.
removedchars=["'S VOICE", "'S WHISPER VOICE", " GATOR"]
for suffix in removedchars:
    magnolia['char']=magnolia['char'].apply(lambda x: x.replace(suffix, ""))
# Build per-scene character lists and per-character line counts.
# The original made three full iterrows() passes (init, fill, repeated
# dedupe) plus a dead `i=0`; one pass plus a dedupe over the much smaller
# scenes dict is equivalent and cheaper.
scenes=dict()
for row in magnolia.iterrows():
    scenes.setdefault(row[1]['scene'], []).append(row[1]['char'])
for sc in scenes:
    scenes[sc]=list(set(scenes[sc]))   # unique characters per scene

# Every character that appears in any scene.
characters=[]
for sc in scenes:
    characters.extend(scenes[sc])
characters=list(set(characters))

# appearances[c] = number of dialogue lines spoken by character c.
appearances=dict.fromkeys(characters, 0)
for row in magnolia.iterrows():
    appearances[row[1]['char']]=appearances[row[1]['char']]+1
# Rank characters by number of spoken lines and keep the ten busiest.
a=pd.DataFrame(appearances, index=list(range(len(appearances))))
ranked=pd.DataFrame(a.transpose()[0].sort_values(0, ascending=False))[0:10]
finalcharacters=[row[0] for row in ranked.iterrows()]
finalcharacters
# Write the character co-appearance matrix (weighted by the number of
# dialogue lines in shared scenes) and collect per-pair totals.
# Fixes: the output file is now closed via a context manager (the original
# leaked it on any exception), the manual `j` counter is an enumerate, and
# the local `long` is renamed for clarity.
couplesappearances=dict()
with open(name+"_nodes.csv", "w") as nodes:
    # Header row: one ;-prefixed column per character.
    for s in finalcharacters:
        nodes.write(";")
        nodes.write(s)
    nodes.write("\n")
    for s in finalcharacters:
        newlist=[0 for _ in finalcharacters]
        for f in finalcharacters:
            couplesappearances[f+"_"+s]=0
        for j, f in enumerate(finalcharacters):
            for p in scenes:
                # Count each unordered pair exactly once (index order breaks the tie).
                if f in scenes[p] and s in scenes[p] and f!=s and finalcharacters.index(f)<finalcharacters.index(s):
                    shared=len(magnolia[magnolia["scene"]==p])  # lines of dialogue in scene p
                    newlist[j]=newlist[j]+shared
                    couplesappearances[f+"_"+s]=couplesappearances[f+"_"+s]+shared
        nodes.write(s)
        for count in newlist:
            nodes.write(";")
            nodes.write(str(count))
        nodes.write("\n")
# Rank character pairs by shared-scene dialogue volume; keep the top four.
a=pd.DataFrame(couplesappearances, index=list(range(len(couplesappearances))))
finalcouples=[]
for row in pd.DataFrame(a.transpose()[0].sort_values(0, ascending=False))[0:4].iterrows():
    finalcouples.append(row[0])

# Persist the selections; with-blocks close the files even on error
# (the original used bare open/close pairs).
with open(name+"_finalcharacters.csv", "w") as out:
    for s in finalcharacters:
        out.write(s+"\n")
with open(name+"_finalcouples.csv", "w") as out:
    for s in finalcouples:
        out.write(s+"\n")

# Characters with more than ten spoken lines.
importantchars=[c for c in appearances if appearances[c]>10]
file=open(name+"_sentiment_overtime_individual.csv", "w")
file2=open(name+"_sentiment_overtime_individualminsmaxs.csv", "w")
for k in finalcharacters:
print(k)
dd=getdialogue(magnolia, k, k, scenes)
dd=[str(d) for d in dd]
polarities, subjectivities=getsentiment(dd)
%matplotlib inline
import matplotlib.pyplot as plt
moveda=maverage(polarities, dd, .99)
plt.plot(moveda)
i=0
for s in moveda:
file.write(k+","+str(float(i)/len(moveda))+", "+str(s)+"\n")
i=i+1
plt.ylabel('polarities')
plt.show()
file2.write(k+"| MIN| "+dd[moveda.index(np.min(moveda))]+"\n")
file2.write(k+"| MAX| "+dd[moveda.index(np.max(moveda))]+"\n")
print("MIN: "+dd[moveda.index(np.min(moveda))])
print("\n")
print("MAX: "+dd[moveda.index(np.max(moveda))])
file.close()
file2.close()
file=open(name+"_sentiment_overtime_couples.csv", "w")
file2=open(name+"_sentiment_overtime_couplesminsmaxs.csv", "w")
for k in finalcouples:
print(k)
liston=k.split("_")
dd=getdialogue(magnolia, liston[0], liston[1], scenes)
dd=[str(d) for d in dd]
polarities, subjectivities=getsentiment(dd)
%matplotlib inline
import matplotlib.pyplot as plt
moveda=maverage(polarities, dd, .99)
plt.plot(moveda)
i=0
for s in moveda:
file.write(k+","+str(float(i)/len(moveda))+", "+str(s)+"\n")
i=i+1
plt.ylabel('polarities')
plt.show()
file2.write(k+"| MIN| "+dd[moveda.index(np.min(moveda))]+"\n")
file2.write(k+"| MAX| "+dd[moveda.index(np.max(moveda))]+"\n")
print("MIN: "+dd[moveda.index(np.min(moveda))])
print("\n")
print("MAX: "+dd[moveda.index(np.max(moveda))])
file.close()
file2.close()
# Tag scene membership: every character in a scene becomes "INSCENE_<name>"
# so basket items distinguish "present in scene" from "speaking".
# The original rotated each list in place (remove front / append prefixed
# front) while iterating over it — it happened to work, but relied on
# mutate-while-iterate behaviour; a comprehension is equivalent and safe.
for key in scenes:
    scenes[key]=["INSCENE_"+c for c in scenes[key]]

# NOTE(review): the original called dropna and discarded the result
# (dropna is not in-place). Assigning it here is behaviour-preserving
# because the basket loop below already skips NaN dialogues.
magnolia=magnolia.dropna(subset=['dialogue'])
# Build one "basket" per dialogue line: the scene-membership tags, a
# SPEAKING_ tag for the speaker, any ?/! marks found, and the cleaned,
# lower-cased words of the dialogue. Duplicates are collapsed per basket.
baskets=[]
spchars=["\"", "'", ".", ",", "-"]   # punctuation stripped from words
attributes=["?", "!"]                # punctuation kept as its own item
for row in magnolia.iterrows():
    dialogue=row[1]['dialogue']
    # NaN dialogues come back from read_csv as floats; skip them and
    # empty strings.
    if type(dialogue)==float or len(dialogue)==0:
        continue
    items=list(scenes[row[1]['scene']])
    items.append("SPEAKING_"+row[1]['char'])
    for word in dialogue.split(" "):
        cleaned=word
        for ch in spchars:
            cleaned=cleaned.replace(ch, "")
        for mark in attributes:
            if cleaned.find(mark)>=0:
                items.append(mark)
                cleaned=cleaned.replace(mark, "")
        if len(cleaned)>0:
            items.append(cleaned.lower())
    baskets.append(list(set(items)))
# One-hot encode the baskets (stopwords removed) and mine association rules.
baskets2=[]
basketslist=[]
for basket in baskets:
    onehot=dict()
    kept=[]
    for item in basket:
        if item not in stopwords:
            onehot[item]=1
            kept.append(item)
    baskets2.append(onehot)
    basketslist.append(kept)
baskets2=pd.DataFrame(baskets2)
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
baskets2=baskets2.fillna(0)
baskets2.to_csv(name+'_basket.csv')
# An itemset must appear in at least 5 baskets to count as frequent.
frequent_itemsets = apriori(baskets2, min_support=5/len(baskets2), use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# NOTE(review): 'antecedants' is the column name emitted by old mlxtend
# releases; newer versions spell it 'antecedents' — confirm against the
# installed mlxtend before rerunning.
rules['one_lower']=[int(alllower(i) or alllower(j)) for i, j in zip(rules['antecedants'], rules['consequents'])]
rules['both_lower']=[int(alllower(i) and alllower(j)) for i, j in zip(rules['antecedants'], rules['consequents'])]
rules.to_csv(name+'_rules.csv', index=None)
| . |
|---|
| WHISPERED VOICE |
| Palabras Distintas |
|---|
| 1919 |
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.577632 | 11.5% |
| sentiment | Porcentaje |
|---|---|
| positive | 65.4% |
| negative | 34.6% |
| sentiment | Porcentaje |
|---|---|
| positive | 21.6% |
| negative | 12.9% |
| trust | 12.8% |
| joy | 12.7% |
| anticipation | 11.5% |
| fear | 6.2% |
| disgust | 5.8% |
| sadness | 5.5% |
| surprise | 5.5% |
| anger | 5.4% |
| sentiment | Porcentaje |
|---|---|
| negative | 34.1% |
| positive | 31.4% |
| uncertainty | 28.4% |
| litigious | 5.7% |
| superfluous | 0.4% |
[1] “Analisis de Sentimientos del Personaje: JUNO” [1] “Numero total de Palabras Unicas en el texto: 1173”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.599455 | 12.2% |
| sentiment | Porcentaje |
|---|---|
| positive | 64.3% |
| negative | 35.7% |
| sentiment | Porcentaje |
|---|---|
| positive | 21.6% |
| joy | 13.7% |
| negative | 12.6% |
| trust | 12.5% |
| anticipation | 10.6% |
| disgust | 6.3% |
| fear | 6.2% |
| sadness | 6.0% |
| surprise | 5.6% |
| anger | 5.1% |
| sentiment | Porcentaje |
|---|---|
| positive | 33.0% |
| negative | 32.1% |
| uncertainty | 31.1% |
| litigious | 3.8% |
[1] “Analisis de Sentimientos del Personaje: MARK” [1] “Numero total de Palabras Unicas en el texto: 517”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.488636 | 9.48% |
| sentiment | Porcentaje |
|---|---|
| positive | 67% |
| negative | 33% |
| sentiment | Porcentaje |
|---|---|
| positive | 21.0% |
| negative | 14.6% |
| anticipation | 13.1% |
| trust | 13.1% |
| joy | 9.4% |
| fear | 7.9% |
| anger | 6.0% |
| disgust | 5.2% |
| sadness | 4.9% |
| surprise | 4.9% |
| sentiment | Porcentaje |
|---|---|
| positive | 43.8% |
| negative | 25.0% |
| uncertainty | 18.8% |
| litigious | 9.4% |
| superfluous | 3.1% |
[1] “Analisis de Sentimientos del Personaje: VANESSA” [1] “Numero total de Palabras Unicas en el texto: 335”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.66 | 9.85% |
| sentiment | Porcentaje |
|---|---|
| positive | 74.5% |
| negative | 25.5% |
| sentiment | Porcentaje |
|---|---|
| positive | 22.2% |
| joy | 16.4% |
| trust | 14.8% |
| anticipation | 13.8% |
| negative | 11.6% |
| sadness | 5.8% |
| surprise | 5.3% |
| fear | 4.2% |
| anger | 3.2% |
| disgust | 2.6% |
| sentiment | Porcentaje |
|---|---|
| negative | 36.8% |
| uncertainty | 36.8% |
| positive | 21.1% |
| litigious | 5.3% |
[1] “Analisis de Sentimientos del Personaje: BLEEKER” [1] “Numero total de Palabras Unicas en el texto: 290”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.770833 | 8.62% |
| sentiment | Porcentaje |
|---|---|
| positive | 68.1% |
| negative | 31.9% |
| sentiment | Porcentaje |
|---|---|
| positive | 21.4% |
| anticipation | 16.7% |
| trust | 11.9% |
| joy | 11.1% |
| negative | 11.1% |
| fear | 7.9% |
| anger | 5.6% |
| surprise | 5.6% |
| sadness | 4.8% |
| disgust | 4.0% |
| sentiment | Porcentaje |
|---|---|
| negative | 50% |
| positive | 30% |
| uncertainty | 20% |
[1] “Analisis de Sentimientos del Personaje: LEAH” [1] “Numero total de Palabras Unicas en el texto: 315”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.381818 | 12.7% |
| sentiment | Porcentaje |
|---|---|
| positive | 61.5% |
| negative | 38.5% |
| sentiment | Porcentaje |
|---|---|
| positive | 23.4% |
| joy | 14.6% |
| trust | 12.4% |
| negative | 10.2% |
| anticipation | 8.8% |
| disgust | 7.3% |
| fear | 7.3% |
| anger | 6.6% |
| sadness | 5.8% |
| surprise | 3.6% |
| sentiment | Porcentaje |
|---|---|
| uncertainty | 60% |
| negative | 30% |
| positive | 10% |
[1] “Analisis de Sentimientos del Personaje: MAC” [1] “Numero total de Palabras Unicas en el texto: 281”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 5.833333 | 12.5% |
| sentiment | Porcentaje |
|---|---|
| positive | 74.4% |
| negative | 25.6% |
| sentiment | Porcentaje |
|---|---|
| positive | 20.6% |
| joy | 15.4% |
| trust | 15.4% |
| negative | 13.2% |
| anticipation | 11.8% |
| surprise | 7.4% |
| anger | 5.9% |
| disgust | 4.4% |
| sadness | 3.7% |
| fear | 2.2% |
| sentiment | Porcentaje |
|---|---|
| positive | 46.2% |
| negative | 30.8% |
| litigious | 15.4% |
| uncertainty | 7.7% |
[1] “Analisis de Sentimientos del Personaje: BREN” [1] “Numero total de Palabras Unicas en el texto: 277”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 4.864865 | 10.8% |
| sentiment | Porcentaje |
|---|---|
| positive | 56.1% |
| negative | 43.9% |
| sentiment | Porcentaje |
|---|---|
| positive | 20.0% |
| negative | 16.0% |
| anticipation | 11.2% |
| trust | 11.2% |
| joy | 9.6% |
| disgust | 8.8% |
| fear | 6.4% |
| sadness | 6.4% |
| surprise | 5.6% |
| anger | 4.8% |
| sentiment | Porcentaje |
|---|---|
| uncertainty | 50% |
| negative | 30% |
| positive | 20% |
[1] “Analisis de Sentimientos del Personaje: ULTRASOUND TECH” [1] “Numero total de Palabras Unicas en el texto: 69”
| Descripcion | Score | % Founded Words |
|---|---|---|
| Entre 0 (negativo) y 10 (positivo) | 6 | 7.25% |
| sentiment | Porcentaje |
|---|---|
| positive | 71.4% |
| negative | 28.6% |
| sentiment | Porcentaje |
|---|---|
| positive | 17.4% |
| trust | 17.4% |
| anticipation | 13.0% |
| joy | 13.0% |
| surprise | 13.0% |
| negative | 8.7% |
| anger | 4.3% |
| disgust | 4.3% |
| fear | 4.3% |
| sadness | 4.3% |
Table: Porcentaje de Palabras encontradas por tipo de sentimiento (loughran): 0%

| sentiment | Porcentaje |
|---|---|
| Personaje | Min_Max | Dialogo |
|---|---|---|
| JUNO | MIN | Oh, I couldn’t copy your work. |
| JUNO | MAX | I’m going to say I’m 104% sure. |
| MARK | MIN | Nesting, huh? Are you planning to build the crib out of twigs and saliva? |
| MARK | MAX | I called Gerta Rauss. She says she can represent both of us. They call it “collaborative divorce.” It’s apparently all the rage right now. And it’s easy because we don’t have children. |
| VANESSA | MIN | Then what’s going on? |
| VANESSA | MAX | That’s great. |
| BLEEKER | MIN | Why? |
| BLEEKER | MAX | I always think you’re cute. I think you’re beautiful. |
| LEAH | MIN | Oh yeah! “Desperately Seeking Spawn.” They’re right by the ads for like, iguanas and terriers and used fitness equipment. It’s totally legit. |
| LEAH | MAX | You know, you can go into early labor sucking face like that! |
| MAC | MIN | You mean like couples? |
| MAC | MAX | We’re fine, thank you. |
| BREN | MIN | They could be utterly negligent. Maybe they’ll do a far shittier job of raising a kid than my dumbass stepdaughter ever would. Have you considered that? |
| BREN | MAX | Oh, go fly a kite. |
| ULTRASOUND TECH | MIN | Well, there we have it. Would you like to know the sex? |
| ULTRASOUND TECH | MAX | Planning to be surprised when you deliver? |
| BLEEKER’S MOM | MIN | Hi Juno. What can I do for you? |
| BLEEKER’S MOM | MAX | Fine. Come in. |
| GERTA | MIN | So, look those over and give me a call at my office if you have any questions. |
| GERTA | MAX | Nice to meet you. |
| Parejas | Min_Max | Dialogo |
|---|---|---|
| JUNO_MARK | MIN | I don’t get a klepto vibe from you. Evil genius? Maybe. Arsonist? Wouldn’t rule it out. |
| JUNO_MARK | MAX | I think it kind of looks like my friend, Paulie. |
| JUNO_LEAH | MIN | The Penny Saver sucks. |
| JUNO_LEAH | MAX | You know, you can go into early labor sucking face like that! |
| JUNO_VANESSA | MIN | Juno, we’d really appreciate it if you could keep us updated on any doctor’s appointments, ultrasounds, other things of that nature. |
| JUNO_VANESSA | MAX | That’s great. |
| MARK_VANESSA | MIN | Juno, we’d really appreciate it if you could keep us updated on any doctor’s appointments, ultrasounds, other things of that nature. |
| MARK_VANESSA | MAX | Um, I think people are kind of unsure about the situation because it’s not, you know, set in stone. |
## [1] "Lift Promedio de las Reglas de Asociacion: 9.65333583138717"
## [1] "Desviación estandar del Lift de las Reglas de Asociacion: 15.2861337489747"
## [1] "Deciles del Lift : "
## 10% 20% 30% 40% 50% 60%
## 1.096774 1.548387 2.172524 2.720000 4.000000 5.224072
## 70% 80% 90% 100%
## 7.311828 13.161290 26.493506 116.571429
| Numero de Dialogos | Lift Minimo | Lift Maximo |
|---|---|---|
| 7,226 | -2 | 2 |
| 9,264 | 2 | 6 |
| 2,676 | 6 | 10 |
| 2,222 | 10 | 14 |
| 346 | 14 | 18 |
| 204 | 18 | 22 |
## [1] "Leverage Promedio de las Reglas de Asociacion: 0.00670772264385702"
## [1] "Desviación estandar del Leverage de las Reglas de Asociacion: 0.00686770072427445"
## [1] "Deciles del Leverage : "
## 10% 20% 30% 40% 50% 60%
## 0.001081315 0.002763360 0.004062440 0.004924488 0.005583790 0.005954741
## 70% 80% 90% 100%
## 0.007145689 0.008389201 0.011514502 0.098958333
| Numero de Dialogos | Leverage Minimo | Leverage Maximo |
|---|---|---|
| 3,314 | -0.0017 | 0.0017 |
| 7,980 | 0.0017 | 0.0051 |
| 9,850 | 0.0051 | 0.0085 |
| 2,122 | 0.0085 | 0.012 |
| 942 | 0.012 | 0.015 |
| 496 | 0.015 | 0.019 |
Pagerank: Reservoir Dogs.